The following R libraries have been used for the analysis.
# Global knitr chunk options: echo code, suppress messages/warnings,
# cache chunk results (with dependency tracking), render wide hi-dpi figures.
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE,
  cache = TRUE,
  autodep = TRUE,
  screenshot.force = TRUE,
  out.width = "100%",
  dpi = 300,
  cache.lazy = FALSE
)
# Install any CRAN packages from the list that are not already present.
# Fixes: TRUE instead of the reassignable shorthand T, and an explicit
# length() > 0 test instead of relying on integer truthiness.
list.of.packages <- c(
  "data.table",
  "tidyverse",
  "lubridate",
  "plotly",
  "leaflet",
  "geosphere",
  "ggthemes",
  "ggcorrplot",
  "webshot",
  "caret",
  "xgboost",
  "car",
  "tidygeocoder"
)
# Packages required by the analysis but missing from the local library
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
if (length(new.packages) > 0) {
  install.packages(new.packages, repos = "https://cran.rstudio.com/", dependencies = TRUE)
}
# Attach every required package in the order given by list.of.packages
# (defined in the install step above), silencing startup chatter so the
# rendered report stays clean.
suppressWarnings(suppressMessages(suppressPackageStartupMessages(
  invisible(lapply(list.of.packages, library, character.only = TRUE))
)))
The zipped files were downloaded manually from the link provided and decompressed to extract the CSVs.
The CSVs are loaded into memory using the fread function from the data.table package.
# Load both raw CSV extracts with data.table::fread (fast, multi-threaded).
# NOTE(review): paths are relative to the document location — confirm the
# ../Data directory exists before knitting.
trip_fare <- fread('../Data/trip_fare_3.csv')
trip_data <- fread('../Data/trip_data_3.csv')
Let’s merge the two datasets to create a single dataset containing all the information.
Since the data does not come with a data dictionary, we will try to define the primary key for the two datasets. Ideally, a unique trip can be identified by the taxi identifier together with a given pick-up datetime.
Based on the primary keys, we will identify if we have duplicates in the two datasets before merging them.
# Assumed composite primary key: the first four columns of trip_fare
# (medallion, hack_license, vendor_id, pickup_datetime per the glimpse
# output below); the same columns lead trip_data, so the key is reused.
join_keycols <- names(trip_fare)[1:4]
# Keying sorts both tables by the key and enables the fast keyed join later
setkeyv(trip_fare, join_keycols)
setkeyv(trip_data, join_keycols)
# Number of records in trip fare dataset
trip_fare[, .N]
#15100468
# Unique records in trip fare dataset by the primary key
uniqueN(trip_fare, by = key(trip_fare))
#15099816
# Number of records in trip data dataset
trip_data[,.N]
#15100468
# Unique records in trip data dataset by the primary key
uniqueN(trip_data, by = key(trip_data))
#15099816
Looking at the statistics above and based on the defined primary key to identify a unique trip, we do see some duplicates in both datasets. We need to ensure the duplicates are removed and that both datasets contain the same trips before merging them.
# Keep a single row per primary key in each table before joining
trip_fare <- unique(trip_fare, by = key(trip_fare))
trip_data <- unique(trip_data, by = key(trip_data))
# Keyed inner join: nomatch = NULL drops trips missing from either table
# (the current data.table spelling; nomatch = 0 is the deprecated form)
trip_combined <- trip_fare[trip_data, nomatch = NULL]
trip_combined[, .N]
#15099816
Since the dataset is large and resources are limited, we will work with a sample of the data instead of the entire dataset. Here, we are taking a random sample of 10% of the entire dataset. The seed is set before taking the sample to make sure the results are reproducible every time the code is re-run. We will save the sample dataset to a file so that we don’t have to repeat these steps every time the code is re-run and can instead read the sample dataset directly from disk.
# Setting the seed makes the 10% sample reproducible across re-runs
set.seed(131)
# Flag ~10% of the records (TRUE with probability 0.1)
trip_combined[, sample_flg := sample(c(TRUE, FALSE), size = .N, replace = TRUE, prob = c(0.1, 0.9))]
# Subset the flagged records as the sample dataset
# (TRUE spelled out; the shorthand T is reassignable and unsafe)
trip_combined_sample <- trip_combined[sample_flg == TRUE]
# Save the contents to disk
fwrite(trip_combined_sample, '../Data/trip_combined_sample.csv')
# Compress the csv — plain string; the original sprintf had no format
# placeholders and was a no-op wrapper
system("gzip -f ../Data/trip_combined_sample.csv")
# Reload the compressed sample so later chunks can start from this file
trip_combined_sample <- fread('../Data/trip_combined_sample.csv.gz')
Let’s look at the data type of the different variables we have in the dataset.
# Column-by-column overview of types and sample values (dplyr::glimpse)
glimpse(trip_combined_sample)
## Observations: 1,573,406
## Variables: 22
## $ medallion <chr> "00005007A9F30E289E760362F69E4EAD", "00005007A9F30…
## $ hack_license <chr> "16780B3E72BAA7A5C152C197919DCA0E", "16780B3E72BAA…
## $ vendor_id <chr> "CMT", "CMT", "CMT", "CMT", "CMT", "CMT", "CMT", "…
## $ pickup_datetime <chr> "2013-03-27 20:21:29", "2013-03-28 01:00:10", "201…
## $ payment_type <chr> "CSH", "CRD", "CRD", "CRD", "CRD", "CSH", "CRD", "…
## $ fare_amount <dbl> 9.5, 8.5, 4.0, 7.0, 7.5, 9.0, 6.5, 5.5, 6.5, 4.0, …
## $ surcharge <dbl> 0.5, 0.5, 0.5, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 1.0, …
## $ mta_tax <dbl> 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, …
## $ tip_amount <dbl> 0.0, 1.0, 1.0, 1.6, 1.7, 0.0, 1.4, 1.5, 1.4, 0.0, …
## $ tolls_amount <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ total_amount <dbl> 10.5, 10.5, 6.0, 9.6, 10.2, 9.5, 8.4, 7.5, 8.4, 5.…
## $ rate_code <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ store_and_fwd_flag <chr> "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", …
## $ dropoff_datetime <chr> "2013-03-27 20:28:09", "2013-03-28 01:08:41", "201…
## $ passenger_count <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 2,…
## $ trip_time_in_secs <int> 399, 511, 166, 324, 424, 507, 491, 270, 395, 181, …
## $ trip_distance <dbl> 2.7, 1.9, 0.3, 1.5, 1.3, 2.0, 0.7, 0.9, 1.1, 0.3, …
## $ pickup_longitude <dbl> -73.99927, -74.00079, -74.00384, -73.96951, -73.97…
## $ pickup_latitude <dbl> 40.75468, 40.73635, 40.70779, 40.78532, 40.79454, …
## $ dropoff_longitude <dbl> -73.97197, -73.98277, -74.00693, -73.95475, -73.94…
## $ dropoff_latitude <dbl> 40.78463, 40.72009, 40.70973, 40.80033, 40.78533, …
## $ sample_flg <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TR…
High Level Key Statistics
# Trips in the sample
nrow(trip_combined_sample)
## [1] 1573406
# Distinct vendors
uniqueN(trip_combined_sample$vendor_id)
## [1] 2
# Distinct medallions (taxis)
uniqueN(trip_combined_sample$medallion)
## [1] 13379
# Distinct drivers
uniqueN(trip_combined_sample$hack_license)
## [1] 32582
# Total distance travelled across all sampled trips
sum(trip_combined_sample$trip_distance)
## [1] 4474955
The date times are read as strings (character) format. Let’s convert date strings to date time format in R
# Parse both timestamp columns from character to POSIXct in one assignment.
# NOTE(review): no tz argument, so the session's local timezone is assumed.
trip_combined_sample[, `:=`(
  pickup_datetime = as.POSIXct(pickup_datetime, format = "%Y-%m-%d %H:%M:%OS"),
  dropoff_datetime = as.POSIXct(dropoff_datetime, format = "%Y-%m-%d %H:%M:%OS")
)]
Summary of data shows that the taxi trips are recorded for only a month, and is from March 2013.
# Per-column summary; the pickup range below confirms March 2013 coverage only
summary(trip_combined_sample)
## medallion hack_license vendor_id
## Length:1573406 Length:1573406 Length:1573406
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## pickup_datetime payment_type fare_amount
## Min. :2013-03-01 00:00:00 Length:1573406 Min. : 2.50
## 1st Qu.:2013-03-08 15:30:15 Class :character 1st Qu.: 6.50
## Median :2013-03-16 02:13:56 Mode :character Median : 9.00
## Mean :2013-03-16 08:09:05 Mean : 12.04
## 3rd Qu.:2013-03-23 20:53:00 3rd Qu.: 13.50
## Max. :2013-03-31 23:59:51 Max. :450.00
##
## surcharge mta_tax tip_amount tolls_amount
## Min. :0.0000 Min. :0.0000 Min. : 0.00 Min. : 0.0000
## 1st Qu.:0.0000 1st Qu.:0.5000 1st Qu.: 0.00 1st Qu.: 0.0000
## Median :0.0000 Median :0.5000 Median : 0.90 Median : 0.0000
## Mean :0.3193 Mean :0.4984 Mean : 1.31 Mean : 0.2351
## 3rd Qu.:0.5000 3rd Qu.:0.5000 3rd Qu.: 2.00 3rd Qu.: 0.0000
## Max. :5.5000 Max. :0.5000 Max. :198.69 Max. :20.0000
##
## total_amount rate_code store_and_fwd_flag
## Min. : 2.50 Min. : 0.000 Length:1573406
## 1st Qu.: 8.00 1st Qu.: 1.000 Class :character
## Median : 11.00 Median : 1.000 Mode :character
## Mean : 14.41 Mean : 1.034
## 3rd Qu.: 16.00 3rd Qu.: 1.000
## Max. :468.00 Max. :210.000
##
## dropoff_datetime passenger_count trip_time_in_secs
## Min. :2013-03-01 00:02:00 Min. :0.000 Min. : 0
## 1st Qu.:2013-03-08 15:43:04 1st Qu.:1.000 1st Qu.: 360
## Median :2013-03-16 02:25:00 Median :1.000 Median : 600
## Mean :2013-03-16 08:21:06 Mean :1.712 Mean : 718
## 3rd Qu.:2013-03-23 21:05:00 3rd Qu.:2.000 3rd Qu.: 924
## Max. :2013-04-01 00:25:05 Max. :6.000 Max. :10440
##
## trip_distance pickup_longitude pickup_latitude dropoff_longitude
## Min. : 0.000 Min. :-2033.6239 Min. :-3113.71 Min. :-1859.27
## 1st Qu.: 1.020 1st Qu.: -73.9922 1st Qu.: 40.73 1st Qu.: -73.99
## Median : 1.770 Median : -73.9820 Median : 40.75 Median : -73.98
## Mean : 2.844 Mean : -72.6652 Mean : 40.03 Mean : -72.63
## 3rd Qu.: 3.190 3rd Qu.: -73.9672 3rd Qu.: 40.77 3rd Qu.: -73.96
## Max. :90.000 Max. : 0.0004 Max. : 1847.05 Max. : 0.00
## NA's :30
## dropoff_latitude sample_flg
## Min. :-3084.32 Mode:logical
## 1st Qu.: 40.73 TRUE:1573406
## Median : 40.75
## Mean : 40.01
## 3rd Qu.: 40.77
## Max. : 1900.35
## NA's :30
The summary statistics show that there are missing values for only two of the variables, dropoff_longitude and dropoff_latitude. There are techniques to analyse missing values as missing at random, missing completely at random, or missing not at random. And there are techniques to treat missing values, like removing the entire record containing missing values or imputing the missing values with the mean, median or some fixed value. But for the purpose of this analysis, we will simply remove them, as the missing observations are a tiny proportion of the sample dataset.
# Drop the rows where either drop-off coordinate is missing
# (30 NAs per the summary above)
trip_combined_sample <- trip_combined_sample[!(is.na(dropoff_longitude) | is.na(dropoff_latitude))]
From the summary statistics, looking at the range of the values of lat and long, it appears the data is either incorrect or the lat/long values are captured in multiple formats.
Assuming lat and long are captured in decimal degrees format, we will remove any erroneous data that does not conform to the decimal degrees format.
Since the latitudes range from -90 to 90 and longitudes range from -180 to 180, any data that is outside that range will be removed from the dataset.
# Keep only coordinates valid in decimal degrees — latitude in [-90, 90],
# longitude in [-180, 180] — for both ends of the trip. Two sequential
# filters are equivalent to one conjunction.
trip_combined_sample <- trip_combined_sample[
  pickup_latitude %between% c(-90, 90) &
    pickup_longitude %between% c(-180, 180) &
    dropoff_latitude %between% c(-90, 90) &
    dropoff_longitude %between% c(-180, 180)
]
The range of lat and long is now within the standard range but it is still too large considering the data is only from New York taxi services.
summary(trip_combined_sample$pickup_latitude)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.5633 40.7346 40.7523 40.0338 40.7667 73.9960
summary(trip_combined_sample$pickup_longitude)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -97.10000 -73.99219 -73.98200 -72.66151 -73.96721 0.00044
summary(trip_combined_sample$dropoff_latitude)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.5633 40.7337 40.7526 40.0124 40.7676 74.0151
summary(trip_combined_sample$dropoff_longitude)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -100.38 -73.99 -73.98 -72.62 -73.96 0.00
The histograms and box plots for the various lats and longs show that most of the data points are concentrated around specific coordinates, but some of the trips are spread all over the world.
# Distribution of pickup latitudes
hist(
  trip_combined_sample$pickup_latitude,
  main = "Histogram for Pickup Latitude",
  xlab = "Pickup Latitude",
  border = "black",
  col = "lightblue"
)
# Box plot of the same variable; the constant x aesthetic yields a single box
ggplot(trip_combined_sample, aes(x = "pickup_latitude", y = pickup_latitude)) +
  geom_boxplot() +
  labs(x = "", y = "", title = "Box Plot for Pickup Latitude") +
  theme(
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5),
    legend.position = "none",
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  scale_colour_tableau()
We know that NYC is in northern hemisphere. Some of the pickups are from equator (latitude = 0) and some are from southern hemisphere.
Clearly data is either wrongly captured or there were errors in capturing the data.
# Distribution of pickup longitudes
hist(
  trip_combined_sample$pickup_longitude,
  main = "Histogram for Pickup Longitude",
  xlab = "Pickup Longitude",
  border = "black",
  col = "lightblue"
)
# Box plot of the same variable; the constant x aesthetic yields a single box
ggplot(trip_combined_sample, aes(x = "pickup_longitude", y = pickup_longitude)) +
  geom_boxplot() +
  labs(x = "", y = "", title = "Box Plot for Pickup Longitude") +
  theme(
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5),
    legend.position = "none",
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  scale_colour_tableau()
# Distribution of drop-off latitudes
hist(
  trip_combined_sample$dropoff_latitude,
  main = "Histogram for Dropoff Latitude",
  xlab = "Dropoff Latitude",
  border = "black",
  col = "lightblue"
)
# Box plot of the same variable; the constant x aesthetic yields a single box
ggplot(trip_combined_sample, aes(x = "dropoff_latitude", y = dropoff_latitude)) +
  geom_boxplot() +
  labs(x = "", y = "", title = "Box Plot for Dropoff Latitude") +
  theme(
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5),
    legend.position = "none",
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  scale_colour_tableau()
# Distribution of drop-off longitudes
hist(
  trip_combined_sample$dropoff_longitude,
  main = "Histogram for Dropoff Longitude",
  xlab = "Dropoff Longitude",
  border = "black",
  col = "lightblue"
)
# Box plot of the same variable; the constant x aesthetic yields a single box
ggplot(trip_combined_sample, aes(x = "dropoff_longitude", y = dropoff_longitude)) +
  geom_boxplot() +
  labs(x = "", y = "", title = "Box Plot for Dropoff Longitude") +
  theme(
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5),
    legend.position = "none",
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  scale_colour_tableau()
Since we are dealing with New York data, the lat and long can further be limited to New York and nearby. NYC coordinates are 40.7141667 lat and -74.0063889 long and if we restrict the pickups and drops within 1000 km radius, we should have a reliable dataset about the NYC taxi trips only
# Restrict both ends of the trip to a ~1000 km box around NYC
# (40.7141667 N, -74.0063889 W; 1 degree is approximately 111 km)
trip_combined_sample <- trip_combined_sample[
  pickup_latitude %between% c(30, 50) &
    pickup_longitude %between% c(-84, -64) &
    dropoff_latitude %between% c(30, 50) &
    dropoff_longitude %between% c(-84, -64)
]
#boxplot.stats(trip_combined_sample$pickup_longitude)
Let’s visualize the pick-up lat longs on the map. We can see that most of trips are concentrated around Manhattan borough (often referred as the city) and some of the pick-ups are from nearby airports.
# Interactive map of the first 1,000 sampled pickups; small translucent navy
# markers keep densely overlapping points readable. The commented-out
# provider tiles are alternative base maps.
leaflet(data = head(trip_combined_sample, 1000)) %>%
addTiles()%>%
#addProviderTiles("Stamen.TonerLite")%>%
#addProviderTiles("Esri.NatGeoWorldMap") %>%
addCircleMarkers(~ pickup_longitude, ~pickup_latitude, radius = 1,
color = "navy", fillOpacity = 0.3)
Some of the trips have a trip distance of 0 and some have trip duration of 0. For this analysis, we will drop them from our datasets
# Discard degenerate trips: zero recorded distance or zero duration
trip_combined_sample <- trip_combined_sample[trip_distance > 0 & trip_time_in_secs > 0]
Most trips are single-passenger trips. We do have 0-passenger trips; maybe they are delivery trips that do not include any passengers. And there are some trips with up to 6 passengers. We are not excluding any trips based on passenger count, as all of these seem to be genuine trips and may contribute towards the fare amount predictions.
# Trip counts and share of trips by passenger count
plot_passenger_dist <- trip_combined_sample[, .N, by = passenger_count]
plot_passenger_dist[, prop := round(N / sum(N), 4)]
# Treat passenger count as a categorical variable for plotting
plot_passenger_dist[, passenger_count := as.factor(passenger_count)]
# Visualize the share of trips per passenger count
ggplot(plot_passenger_dist, aes(passenger_count, prop, fill = passenger_count)) +
  geom_col() +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Number of Passengers",
    y = "Number of Trips (March 2013)",
    title = "Number of Passengers/Trip"
  ) +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5),
    legend.position = "none"
  ) +
  scale_colour_tableau()
Since there is no data dictionary for the abbreviated payment types, we are assuming CRD means a credit card transaction and CSH means a cash transaction. UNK might be unknown transactions. Most of the transactions are either CRD or CSH.
# Trip counts by payment type
plot_payment_type_dist <- trip_combined_sample[, .N, by = payment_type]
# Treat payment type as a categorical variable for plotting
plot_payment_type_dist[, payment_type := as.factor(payment_type)]
# Visualize the share of trips per payment type
ggplot(plot_payment_type_dist, aes(payment_type, N / sum(N), fill = payment_type)) +
  geom_col() +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Payment Type", y = "Number of Trips", title = "Payment Type/Trip") +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5),
    legend.position = "none"
  ) +
  scale_colour_tableau()
Most of the trips are in the under-$80 range. Some of the high fare amounts (>$100) need to be explored further. The fare amount should be a function of trip distance and trip duration, so we will look at whether the high fares are justified by their corresponding trip distances and durations or whether they are outliers.
# Fare amount distribution in $20 bins
ggplot(trip_combined_sample, aes(fare_amount)) +
  geom_histogram(binwidth = 20, col = "black", fill = "light blue") +
  labs(x = "Fare Amount", y = "Number of Trips", title = "Histogram of Fare Amount") +
  theme(
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5),
    legend.position = "none"
  ) +
  scale_colour_tableau()
Most of the tips are under $10 range. Some of the high tip amounts needs to be explored further along with fare amount to detect any outliers.
# Tip amount distribution in $10 bins
ggplot(trip_combined_sample, aes(tip_amount)) +
  geom_histogram(binwidth = 10, col = "black", fill = "light blue") +
  labs(x = "Tip Amount", y = "Number of Trips", title = "Histogram of Tip Amount") +
  theme(
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5),
    legend.position = "none"
  ) +
  scale_colour_tableau()
Most of the total amounts are under $100 range and are along the lines of fare amount.
# Total amount distribution in $20 bins
ggplot(trip_combined_sample, aes(total_amount)) +
  geom_histogram(binwidth = 20, col = "black", fill = "light blue") +
  labs(x = "Total Amount", y = "Number of Trips", title = "Histogram of Total Amount") +
  theme(
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5),
    legend.position = "none"
  ) +
  scale_colour_tableau()
We know that the fare amount comprises multiple price components such as the starting fare, fare per mile/km, waiting fares, tolls, surcharges and other fees. The major component would be the fare per mile/km (for longer trips). Let’s try to compute a fare per unit distance and try to find any abnormalities in the data that can impact predicting the fare prices. We will try to remove any records that are outside 2 standard deviations.
# Fare normalised by trip distance, used to spot implausible fares.
# Zero-distance trips were removed earlier, so the division cannot yield Inf.
trip_combined_sample[, fare_per_dist := (fare_amount/trip_distance)]
# Calculate 2 standard deviations
trip_combined_sample[,.(min_fare = min(fare_per_dist),
max_fare = max(fare_per_dist),
sd_2_fare_upper = mean(fare_per_dist) + 2 * sd(fare_per_dist),
sd_2_fare_lower = mean(fare_per_dist) - 2 * sd(fare_per_dist)
)
]
## min_fare max_fare sd_2_fare_upper sd_2_fare_lower
## 1: 0.02777778 10000 46.03531 -34.19328
# Visualize if the trip duration has an impact on the high fare_per_dist for lower trip distances
# NOTE(review): head() takes the first 10,000 rows, not a random sample, so
# these points are not necessarily representative of the whole dataset.
ggplot(head(trip_combined_sample, 10000)) +
geom_point(aes(x=trip_distance, y=fare_per_dist, col = (trip_time_in_secs/60))) +
labs(x = "Trip Distance", y = "Fare amount per unit distance") +
labs(col = "Trip Time (in mins)") +
ggtitle("Fare amount per unit distance Vs trip distance & time") +
theme(panel.background = element_blank(),
axis.line = element_line(colour = "black"),
plot.title = element_text(hjust = 0.5))
Looking at the chart above, it seems that some of the high fares do not have correspondingly high trip durations. They seem to be outliers. Let’s remove the extreme upper tail of fare_per_dist (more than 2 standard deviations above the mean).
# NOTE(review): only the UPPER 2-sd bound is applied here. The lower bound
# (mean - 2*sd, approximately -34.19 per the output above) is negative while
# fare_per_dist is always positive, so trimming the lower tail would remove
# nothing — but the surrounding prose says "tail ends" (plural).
trip_combined_sample <- trip_combined_sample[fare_per_dist < (mean(fare_per_dist) + 2 * sd(fare_per_dist))]
To analyse the impact of time/day on the number of trips, let’s create new features/variables and see the impact of day of week on the volume of trips made.
# Derive the labelled day-of-week factor from the pickup timestamp
trip_combined_sample[, pickup_wday := wday(pickup_datetime, label = TRUE)]
# Pickup counts per weekday and vendor
plot_wday_trips <- trip_combined_sample[, .N, by = .(pickup_wday, vendor_id)]
ggplot(plot_wday_trips, aes(pickup_wday, N, colour = vendor_id)) +
  geom_point(size = 4) +
  labs(
    x = "Day of the week",
    y = "Total number of pickups",
    col = "Vendor",
    title = "Taxi trips by day of the week"
  ) +
  theme(
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5)
  )
Looking at the chart above, it shows that Saturdays were the busiest days for the month of March 2013, followed by Fridays and Sundays.
As the fares are dependent on the time of day (odd hours fares are higher than normal business hours fares), let’s compute the hour of the day feature/variable to see the impact of time of the day on the volumes of trips and average fare amount.
# Hour of day (0-23) extracted from the pickup timestamp
trip_combined_sample[, pickup_hour := hour(pickup_datetime)]
plot_hourly_trips <- trip_combined_sample[, .N, by = .(pickup_hour)]
# Bars ordered by descending trip count; sqrt scale compresses the range
ggplot(plot_hourly_trips, aes(reorder(pickup_hour, -N), N, fill = reorder(pickup_hour, -N))) +
  geom_col() +
  scale_y_sqrt() +
  labs(x = "Hour of the day", y = "Total number of pickups", title = "Taxi trips by hour of the day") +
  theme(
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5),
    legend.position = "none"
  )
From the chart, it is clear that the busiest hours are 7pm, 6pm, 8pm, 9pm and 10 pm. In short the evening times are the busiest from 6-10pm
# Mean fare per pickup hour, bars ordered by descending mean fare
plot_hourly_fares <- trip_combined_sample[, .(mean_hourly_fare = mean(fare_amount)), by = .(pickup_hour)]
ggplot(plot_hourly_fares, aes(reorder(pickup_hour, -mean_hourly_fare), mean_hourly_fare, fill = reorder(pickup_hour, -mean_hourly_fare))) +
  geom_col() +
  scale_y_sqrt() +
  labs(x = "Hour of the day", y = "Average Fare Amount", title = "Fare amount by hour of the day") +
  theme(
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5),
    legend.position = "none"
  )
Early morning fare averages are high peaking at 5am.
# Mean trip duration (seconds) per pickup hour, ordered by descending mean
plot_hourly_trip_duration <- trip_combined_sample[, .(mean_hourly_trip_duration = mean(trip_time_in_secs)), by = .(pickup_hour)]
ggplot(plot_hourly_trip_duration, aes(reorder(pickup_hour, -mean_hourly_trip_duration), mean_hourly_trip_duration, fill = reorder(pickup_hour, -mean_hourly_trip_duration))) +
  geom_col() +
  scale_y_sqrt() +
  labs(x = "Hour of the day", y = "Average Trip Duration", title = "Trip Duration by hour of the day") +
  theme(
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5),
    legend.position = "none"
  )
Trip in the afternoons till evenings are longer duration trips, probably because of traffic.
# Heatmap of trip counts for every weekday x hour combination
plot_wday_hour_trips <- trip_combined_sample[, .N, by = .(pickup_wday, pickup_hour)]
ggplot(plot_wday_hour_trips, aes(pickup_wday, pickup_hour, fill = N, text = N)) +
  geom_tile() +
  scale_fill_gradient(low = "Light Blue", high = "Dark Blue") +
  labs(x = "Day of the Week", y = "Hour of the Day", fill = "Trips")
Fridays and Weekends are the busiest and trips in evenings 6-10pm are at peak.
Here are the top 10 locations/postcodes by pickup
# Round coordinates to 2 decimal places (~1.1 km cells) to bucket pickups
trip_combined_sample[, pickup_latitude_rd := round(pickup_latitude,2)]
trip_combined_sample[, pickup_longitude_rd := round(pickup_longitude,2)]
# Ten busiest cells by pickup count
top_10_pickups <- head(trip_combined_sample[, .N, by = list(pickup_latitude_rd, pickup_longitude_rd)][order(-N)],10)
# tidygeocoder works on plain data.frames; convert in place, then back.
# reverse_geocode hits the OSM/Nominatim service — requires network access.
setDF(top_10_pickups)
top_10_pickups_rev_geo <- top_10_pickups%>%reverse_geocode(pickup_latitude_rd, pickup_longitude_rd, method = 'osm',
address = address_found, full_results = TRUE)
setDT(top_10_pickups_rev_geo)
# Show each cell, its trip count, and the reverse-geocoded address/postcode
top_10_pickups_rev_geo[, list(pickup_latitude_rd, pickup_longitude_rd, trips = N, address_found, postcode)]
## pickup_latitude_rd pickup_longitude_rd trips
## 1: 40.76 -73.97 93478
## 2: 40.75 -73.99 88867
## 3: 40.76 -73.98 79883
## 4: 40.75 -73.98 77431
## 5: 40.74 -73.99 71742
## 6: 40.76 -73.99 70553
## 7: 40.73 -73.99 58340
## 8: 40.77 -73.96 51885
## 9: 40.73 -74.00 50254
## 10: 40.74 -74.00 47547
## address_found
## 1: 664, Lexington Avenue, Manhattan Community Board 5, Manhattan, New York, 10022, United States
## 2: 137, West 33rd Street, Manhattan Community Board 5, Manhattan, New York County, New York, 10001, United States
## 3: 1270 Avenue of the Americas, 1270, 6th Avenue, Midtown, Manhattan Community Board 5, Manhattan, New York, 10020, United States
## 4: 35, East 38th Street, Murray Hill, Manhattan Community Board 6, Manhattan, New York County, New York, 10016, United States
## 5: 10, East 21st Street, Manhattan Community Board 5, Manhattan, New York, 10010, United States
## 6: 341, West 45th Street, Theater District, Hell's Kitchen, Manhattan, New York, 10036, United States
## 7: 51 Astor Place, East 9th Street, East Village, Manhattan Community Board 3, Manhattan, New York, 10003, United States
## 8: Dallas BBQ, 1265, 3rd Avenue, Carnegie Hill, Manhattan Community Board 8, Manhattan, New York, 10021, United States
## 9: Filomen D'agostino Residence Hall, 110, West 3rd Street, Washington Square Village, Manhattan Community Board 2, Manhattan, New York, 10012, United States
## 10: 218, West 16th Street, Chelsea District, Manhattan, New York, 10011, United States
## postcode
## 1: 10022
## 2: 10001
## 3: 10020
## 4: 10016
## 5: 10010
## 6: 10036
## 7: 10003
## 8: 10021
## 9: 10012
## 10: 10011
To see which are the busiest locations for taxi pickup, and the impact of location on fare amount, let us compute a feature/variable which depicts the distance from the city center. The hypothesis is that the locations closer to the city center will be the busiest.
# New York City center coordinates in (longitude, latitude) order, as
# expected by geosphere::distm. Fixed: use <- for assignment, not =.
nyc <- c(-74.0063889, 40.7141667)
# Haversine distance (metres) from each pickup point to the city center
trip_combined_sample$pickup_dist_from_city <- distm(trip_combined_sample[,list(pickup_longitude, pickup_latitude)], nyc, fun = distHaversine)
Converting distance in meters to kilometers
# Convert distance from metres to kilometres, rounded to 2 decimal places
trip_combined_sample[, pickup_dist_from_city := round(pickup_dist_from_city/1000, 2)]
summary(trip_combined_sample$pickup_dist_from_city)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 3.070 4.990 5.407 6.970 1210.840
The summary shows the maximum distance of a pickup from city center is 1200 km and is consistent with the ~1000km radius we applied to the sample dataset based on lat and long degrees. 75% of the pickups are within 7km from city centre.
# Bucket pickups into 2 km distance bands from the city center.
# Inside j the bare column name resolves to the same vector as the
# redundant trip_combined_sample$ prefix the original used.
trip_combined_sample[, km_bin := cut(
  pickup_dist_from_city,
  seq(0, max(pickup_dist_from_city) + 2, by = 2),
  include.lowest = TRUE,
  right = FALSE
)]
# Only chart pickups within 30 km of the center
plot_distance_from_city <- trip_combined_sample[pickup_dist_from_city <= 30, .N, by = .(km_bin)]
ggplot(plot_distance_from_city, aes(reorder(km_bin, -N), N, fill = reorder(km_bin, -N))) +
  geom_col() +
  scale_y_sqrt() +
  labs(
    x = "Distance from city (km) 2 km bins",
    y = "Number of pickups",
    title = "Number of pickups by distance from city"
  ) +
  theme(
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5),
    legend.position = "none"
  )
The above chart shows top ten locations (in terms of radial distance from NYC center). The top 5 busiest locations are within 10km radius of the New York city center.
Looking at the chart below, the fare amount has a fairly high linear relationship with trip distance and trip duration which is actually true as fare amount is directly proportional to the distance travelled and time taken by the trip.
# Numeric-only view of the sample for the correlation matrix; datetimes and
# the weekday factor are coerced to numeric so cor() can consume them.
trip_combined_sample_cormat <- trip_combined_sample[, .(
  rate_code,
  pickup_datetime = as.numeric(pickup_datetime),
  dropoff_datetime = as.numeric(dropoff_datetime),
  passenger_count,
  trip_time_in_secs,
  trip_distance,
  pickup_longitude,
  pickup_latitude,
  dropoff_longitude,
  dropoff_latitude,
  pickup_wday = as.numeric(pickup_wday),
  pickup_hour,
  pickup_dist_from_city,
  fare_amount
)]
# Pairwise-complete correlations, rendered as a lower-triangle heatmap
corr <- cor(trip_combined_sample_cormat, use = "pairwise.complete.obs")
ggcorrplot(corr,
  hc.order = FALSE,
  type = "lower",
  ggtheme = ggthemes::theme_gdocs,
  colors = c("#ff7f0e", "white", "#1f83b4"),
  lab = TRUE
) +
  ggtitle("Correlation Matrix") +
  theme(
    panel.grid.major = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )
Let’s try to build a model for predicting the fare amount based on all the continuous variables as shown in the above chart, and also try to build a model using just the highly correlated independent variables trip_distance and trip_time_in_secs.
We will use 80% of the sample data to train the model and 20% of the data to test the model performance.
# Setting seed for reproducible results
set.seed(131)
# caret::createDataPartition returns row indices for an ~80% training split
training_samples <- createDataPartition(trip_combined_sample_cormat$fare_amount, p = 0.8, list = FALSE)
# Remaining ~20% of rows (negative indexing) becomes the hold-out test set
train_data <- trip_combined_sample_cormat[training_samples, ]
test_data <- trip_combined_sample_cormat[-training_samples, ]
For the first model, we will use all the continuous variables.
# Train the model
# Linear model: fare_amount regressed on every other column of train_data
model_1<-lm(fare_amount~., data = train_data)
model_summary <- summary(model_1)
model_summary
##
## Call:
## lm(formula = fare_amount ~ ., data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -442.84 -0.33 -0.07 0.20 163.13
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.040e+01 7.646e+00 -6.591 4.36e-11 ***
## rate_code 2.116e+00 6.155e-03 343.867 < 2e-16 ***
## pickup_datetime -7.372e-05 1.506e-05 -4.894 9.88e-07 ***
## dropoff_datetime 7.374e-05 1.506e-05 4.895 9.81e-07 ***
## passenger_count -1.940e-02 1.481e-03 -13.097 < 2e-16 ***
## trip_time_in_secs 5.796e-03 1.636e-05 354.399 < 2e-16 ***
## trip_distance 1.916e+00 1.123e-03 1705.065 < 2e-16 ***
## pickup_longitude 6.561e-01 5.662e-02 11.587 < 2e-16 ***
## pickup_latitude 8.695e-01 6.121e-02 14.205 < 2e-16 ***
## dropoff_longitude -6.448e-01 5.453e-02 -11.824 < 2e-16 ***
## dropoff_latitude -3.045e-01 5.402e-02 -5.637 1.74e-08 ***
## pickup_wday -5.109e-04 9.912e-04 -0.515 0.606
## pickup_hour -9.014e-03 3.156e-04 -28.559 < 2e-16 ***
## pickup_dist_from_city 2.100e-03 4.928e-04 4.262 2.03e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.271 on 1224935 degrees of freedom
## Multiple R-squared: 0.9421, Adjusted R-squared: 0.9421
## F-statistic: 1.533e+06 on 13 and 1224935 DF, p-value: < 2.2e-16
The model summary tells us that almost all the independent variables are significant in predicting the final value of the dependent variable (fare amount). An R-squared of 94.21% is a pretty good measure of the model performance. We need to ensure we are not overfitting the model and that there are no effects of multicollinearity among the independent variables that can impact the model performance.
We will use root mean square error (RMSE) and R-squared (R2) to measure and test the model performance.
RMSE is the mean error on the predictions and tells us how far the predicted value is from the actual. For this metric, the lower the error, the better the model performance.
R2 is the measure of proportion of the variability in the data that can be explained by the variations in the independent variables. Ideally, 100% of the variation in the dependent variable must be explained by the variations in the independent variables. So, for this metric, the higher values suggests a better performing model.
# Make predictions
predictions <- predict(model_1, test_data)
# Model performance
# RMSE and R2 (caret helpers) evaluated on the held-out 20% test set
data.table(
RMSE = RMSE(predictions, test_data$fare_amount),
R2 = R2(predictions, test_data$fare_amount)
)
## RMSE R2
## 1: 2.411707 0.9352333
To detect multicollinearity in a regression model, we will test the Variance Inflation Factor (VIF) of each of the independent variables. A cut-off of VIF = 5 is used to gauge the magnitude of multicollinearity, and variables with VIF > 5 are generally removed from the regression model.
# Variance inflation factors for every predictor in model_1 (VIF > 5 flags multicollinearity)
vif(model_1)
## rate_code pickup_datetime dropoff_datetime
## 1.085621e+00 3.214340e+07 3.214314e+07
## passenger_count trip_time_in_secs trip_distance
## 1.000618e+00 1.701438e+01 3.305365e+00
## pickup_longitude pickup_latitude dropoff_longitude
## 1.410441e+00 1.196281e+00 1.196667e+00
## dropoff_latitude pickup_wday pickup_hour
## 1.135312e+00 1.006898e+00 1.009896e+00
## pickup_dist_from_city
## 1.379146e+00
The VIF is quite high for two of the independent variables: pickup_datetime and dropoff_datetime.
Let’s remove these variables and refit the regression model.
# Refit the linear model after dropping the two datetime predictors that
# showed very high VIF in model_1
model_2 <- lm(
  fare_amount ~ . - pickup_datetime - dropoff_datetime,
  data = train_data
)
model_summary <- summary(model_2)
print(model_summary)
##
## Call:
## lm(formula = fare_amount ~ . - pickup_datetime - dropoff_datetime,
## data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -442.79 -0.33 -0.07 0.20 163.12
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.205e+01 6.738e+00 -3.273 0.00106 **
## rate_code 2.116e+00 6.155e-03 343.836 < 2e-16 ***
## passenger_count -1.930e-02 1.481e-03 -13.032 < 2e-16 ***
## trip_time_in_secs 5.869e-03 6.466e-06 907.613 < 2e-16 ***
## trip_distance 1.916e+00 1.123e-03 1705.799 < 2e-16 ***
## pickup_longitude 6.542e-01 5.662e-02 11.554 < 2e-16 ***
## pickup_latitude 8.669e-01 6.121e-02 14.162 < 2e-16 ***
## dropoff_longitude -6.463e-01 5.453e-02 -11.852 < 2e-16 ***
## dropoff_latitude -3.064e-01 5.402e-02 -5.672 1.42e-08 ***
## pickup_wday -6.830e-04 9.910e-04 -0.689 0.49069
## pickup_hour -8.975e-03 3.155e-04 -28.450 < 2e-16 ***
## pickup_dist_from_city 2.065e-03 4.928e-04 4.190 2.79e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.272 on 1224937 degrees of freedom
## Multiple R-squared: 0.9421, Adjusted R-squared: 0.9421
## F-statistic: 1.812e+06 on 11 and 1224937 DF, p-value: < 2.2e-16
# Score the test set with model_2
predictions <- predict(model_2, newdata = test_data)

# Same performance metrics as before, so the models are directly comparable
perf_m2 <- data.table(
  RMSE = RMSE(predictions, test_data$fare_amount),
  R2   = R2(predictions, test_data$fare_amount)
)
perf_m2
## RMSE R2
## 1: 2.411694 0.935234
# Re-check VIF after dropping the two datetime variables; all values should now be low
vif(model_2)
## rate_code passenger_count trip_time_in_secs
## 1.085616 1.000542 2.659152
## trip_distance pickup_longitude pickup_latitude
## 3.303181 1.410422 1.196253
## dropoff_longitude dropoff_latitude pickup_wday
## 1.196648 1.135295 1.006494
## pickup_hour pickup_dist_from_city
## 1.008834 1.379057
Both the RMSE and R2 get slightly better after removing the two high-VIF variables, although the change in predictive performance is not significant. The VIFs of model_2 improve substantially. Hence, model_2 is better than model_1.
Let’s also look at predicting the fare amount solely based on trip distance and trip duration and see if the model performance improves or declines.
# Minimal model: explain fare using only trip distance and trip duration
model_3 <- lm(
  fare_amount ~ trip_distance + trip_time_in_secs,
  data = train_data
)
model_summary <- summary(model_3)
print(model_summary)
##
## Call:
## lm(formula = fare_amount ~ trip_distance + trip_time_in_secs,
## data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -188.784 -0.349 -0.070 0.193 165.669
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.101e+00 3.774e-03 556.7 <2e-16 ***
## trip_distance 1.990e+00 1.043e-03 1908.5 <2e-16 ***
## trip_time_in_secs 5.756e-03 6.692e-06 860.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.381 on 1224946 degrees of freedom
## Multiple R-squared: 0.9364, Adjusted R-squared: 0.9364
## F-statistic: 9.018e+06 on 2 and 1224946 DF, p-value: < 2.2e-16
# Score the test set with the two-predictor model
predictions <- predict(model_3, newdata = test_data)

# Out-of-sample RMSE and R-squared for model_3
actuals <- test_data$fare_amount
data.table(
  RMSE = RMSE(predictions, actuals),
  R2   = R2(predictions, actuals)
)
## RMSE R2
## 1: 2.456246 0.9328061
# VIF for the two-predictor model (distance and duration are moderately correlated)
vif(model_3)
## trip_distance trip_time_in_secs
## 2.59301 2.59301
The RMSE increases and the R-squared decreases for the third model with just two independent variables, which makes model_2 the best model so far.
Next we try to fit a machine learning model to our data and see if we can get a better-performing model. We are going to try an extreme gradient boosting (xgboost) model for our regression problem. Here we fit an xgboost model with randomly chosen hyperparameters, using the same training dataset that we used earlier for the linear models, and test its performance using the same metrics and the same test dataset as before.
# Convert the train/test predictors (first 12 columns) into xgboost's native
# DMatrix format; NaN marks missing values. The fare amount is attached as the
# training label inside each DMatrix.
feature_cols <- names(train_data)[1:12]
xg_train <- xgb.DMatrix(
  data.matrix(train_data[, feature_cols, with = FALSE]),
  label   = train_data$fare_amount,
  missing = NaN
)
xg_test <- xgb.DMatrix(
  data.matrix(test_data[, names(test_data)[1:12], with = FALSE]),
  label   = test_data$fare_amount,
  missing = NaN
)
# Hyperparameters were chosen by hand but could be tuned with a grid search.
# Two fixes versus the original call:
#   1. `label` is not passed again — xg_train is an xgb.DMatrix that already
#      embeds the label, and xgboost warns and ignores a second `label` arg.
#   2. The tree-depth parameter is named `max_depth` in xgboost; the original
#      `depth = 6` was an unknown parameter and was silently ignored (the
#      default max_depth is also 6, so fitted results are unchanged).
model_4 <- xgboost(
  data      = xg_train,           # features + label as xgb.DMatrix
  nrounds   = 100,                # number of boosting rounds (trees)
  objective = 'reg:squarederror', # squared-error regression objective
  eta       = 0.3,                # learning rate
  max_depth = 6,                  # maximum tree depth (was mis-spelled `depth`)
  verbose   = 0                   # suppress per-round training log
)
# Predict on the test DMatrix and compute the same metrics used for the
# linear models so the comparison is like-for-like
pred <- predict(model_4, xg_test)
actuals <- test_data$fare_amount
data.table(
  RMSE = RMSE(pred, actuals),
  R2   = R2(pred, actuals)
)
## RMSE R2
## 1: 1.114159 0.9861759
We can see that the machine learning model performs much better than linear regression, with a lower RMSE and a better R-squared.
# Rank the 12 features by their contribution to the fitted booster (Gain/Cover/Frequency)
xgb.importance(names(train_data)[1:12], model = model_4)
## Feature Gain Cover Frequency
## 1: trip_distance 8.579110e-01 0.336545247 0.201131999
## 2: trip_time_in_secs 9.810436e-02 0.208954398 0.208611280
## 3: rate_code 3.568263e-02 0.098816181 0.135435618
## 4: dropoff_longitude 3.321075e-03 0.142506303 0.106731352
## 5: dropoff_latitude 1.877586e-03 0.079409242 0.082474227
## 6: pickup_longitude 1.544009e-03 0.036183114 0.073984233
## 7: pickup_latitude 5.800135e-04 0.028651825 0.053972104
## 8: pickup_datetime 4.599046e-04 0.010846980 0.071154235
## 9: pickup_hour 3.364528e-04 0.025978723 0.035172832
## 10: pickup_wday 9.481826e-05 0.013578814 0.014756418
## 11: passenger_count 4.520735e-05 0.006634207 0.011724277
## 12: dropoff_datetime 4.292596e-05 0.011894966 0.004851425
The feature importance shows the top two features are trip_distance and trip_time_in_secs
As an owner, I could think of two approaches: * Target the peak demand hours * Target the peak earning hours while minimising travel time
The off-peak hours can be skipped and can be used for taxi maintenance.
Assuming an 8-hour shift, let’s see which starting hour of the shift in a day gives maximum earnings.
# Total fare collected for each pickup hour of the day (0-23)
sum_earnings_hr <- trip_combined_sample[
  , .(fare_amount_sum = sum(fare_amount)),
  by = list(pickup_hour)
][order(pickup_hour)]

# Wrap around midnight: repeat the first 7 hours at the end so every starting
# hour has a full 8-hour window for the left-aligned rolling sum
sum_earnings_hr <- rbind(sum_earnings_hr, head(sum_earnings_hr, 7))
sum_earnings_hr[, shift_earnings := frollsum(fare_amount_sum, n = 8, align = 'left')]

# The trailing wrap-around rows have incomplete windows (NA) — drop them
sum_earnings_hr <- sum_earnings_hr[!is.na(shift_earnings)]
# Bar chart of 8-hour-shift earnings by starting hour, best starting hour first
ggplot(sum_earnings_hr,
       aes(x = reorder(pickup_hour, -shift_earnings),
           y = shift_earnings,
           fill = reorder(pickup_hour, -shift_earnings))) +
  geom_col() +
  scale_y_sqrt() +
  labs(x = "Hour of the day", y = "Total earnings") +
  ggtitle("Total earnings for 8 hour shift by starting hour ") +
  theme(panel.background = element_blank(),
        axis.line = element_line(colour = "black"),
        plot.title = element_text(hjust = 0.5),
        legend.position = "none")
From the chart, it is clear that to maximise the earnings, it is better to start the 8-hour shift from 4pm (evenings)
Here the assumption is that work time is calculated by the time spent working on trips (trip durations).
Again assuming an 8-hour shift, let’s see which starting hour of the shift in a day yields above-average earnings while spending the least amount of time on trips.
# Per pickup hour: total fare collected and total time spent on trips
sum_earnings_trips_hr <- trip_combined_sample[
  , .(fare_amount_sum = sum(fare_amount),
      trip_time_sum   = sum(trip_time_in_secs)),
  by = list(pickup_hour)
][order(pickup_hour)]

# Wrap around midnight: repeat the first 7 hours at the end so every starting
# hour has a full 8-hour window for the left-aligned rolling means
sum_earnings_trips_hr <- rbind(sum_earnings_trips_hr, head(sum_earnings_trips_hr, 7))
sum_earnings_trips_hr[, fare_amount_sum_shift := frollmean(fare_amount_sum, n = 8, align = 'left')]
sum_earnings_trips_hr[, trip_time_sum_shift := frollmean(trip_time_sum, n = 8, align = 'left')]

# The trailing wrap-around rows have incomplete windows (NA) — drop them
sum_earnings_trips_hr <- sum_earnings_trips_hr[!is.na(fare_amount_sum_shift)]
# Restrict to starting hours whose shift earnings are at least average, then
# plot total trip time per shift, shortest first
ggplot(sum_earnings_trips_hr[fare_amount_sum_shift >= mean(fare_amount_sum_shift)],
       aes(x = reorder(pickup_hour, trip_time_sum_shift),
           y = trip_time_sum_shift,
           fill = reorder(pickup_hour, trip_time_sum_shift))) +
  geom_col() +
  scale_y_sqrt() +
  labs(x = "Hour of the day", y = "Total trip durations") +
  ggtitle("Total trip durations for 8 hour shift by starting hour") +
  theme(panel.background = element_blank(),
        axis.line = element_line(colour = "black"),
        plot.title = element_text(hjust = 0.5),
        legend.position = "none")
The chart shows that if we start the 8-hour shift at 7pm (evenings), we will spend the least time on taxi trips while still maintaining average earnings. Further, based on the peak times, we can rank the top pickup locations that earn the most total fare.
Based on the given data: